library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
This report analyzes DeepSEA functional annotations for fine-mapped eQTLs (LMM), comparing TST-responsive genes vs non-TST genes.
# Load eigenMT results
cat("Loading eigenMT results...\n")
## Loading eigenMT results...
eGenes <- fread(params$eigenMT_path)
eGenes_sig <- eGenes[eGenes$BF.FDR < params$FDR_threshold, ]
cat("Significant eGenes (FDR <", params$FDR_threshold, "):", nrow(eGenes_sig), "\n")
## Significant eGenes (FDR < 0.05 ): 7411
cat("Unique genes:", length(unique(eGenes_sig$gene)), "\n")
## Unique genes: 7411
cat("Unique SNPs:", length(unique(eGenes_sig$SNP)), "\n")
## Unique SNPs: 7189
# Load TST genes
cat("Loading TST genes...\n")
## Loading TST genes...
TST <- fread(params$TST_genes_path)
TST_filtered <- TST[TST$MaxFC >= params$FC_threshold, ]
cat("TST genes (FC >=", params$FC_threshold, "):", nrow(TST_filtered), "\n")
## TST genes (FC >= 0.58 ): 5526
# Split into TST and non-TST eGenes
eGenes_TST <- eGenes_sig[eGenes_sig$gene %in% TST_filtered$HGNC, ]
eGenes_nonTST <- eGenes_sig[!eGenes_sig$gene %in% TST_filtered$HGNC, ]
cat("TST eGenes:", nrow(eGenes_TST), "\n")
## TST eGenes: 1719
cat("Non-TST eGenes:", nrow(eGenes_nonTST), "\n")
## Non-TST eGenes: 5692
# Check for duplicated SNPs
dup_snps <- eGenes_sig[duplicated(eGenes_sig$SNP), ]
cat("Duplicated SNPs:", nrow(dup_snps), "\n")
## Duplicated SNPs: 222
cat("Loading DeepSEA 40 sequence class data...\n")
## Loading DeepSEA 40 sequence class data...
if (file.exists(params$deepsea_40seq_path)) {
DeepSEA_40 <- fread(params$deepsea_40seq_path)
cat("DeepSEA 40 sequence class data dimensions:", dim(DeepSEA_40), "\n")
# Merge with TST eGenes
DeepSEA_TST <- merge(eGenes_TST, DeepSEA_40, by.x = "SNP", by.y = "id")
cat("Merged TST eGenes with DeepSEA:", nrow(DeepSEA_TST), "\n")
# Get sequence class columns (assuming they start at column 18)
seq_cols <- colnames(DeepSEA_TST)[18:57]
cat("Number of sequence classes:", length(seq_cols), "\n")
cat("First few sequence classes:", head(seq_cols), "\n")
} else {
stop("DeepSEA 40 sequence class file not found: ", params$deepsea_40seq_path)
}
## DeepSEA 40 sequence class data dimensions: 7189 47
## Merged TST eGenes with DeepSEA: 1719
## Number of sequence classes: 40
## First few sequence classes: PC1 Polycomb / Heterochromatin L1 Low signal TN1 Transcription TN2 Transcription L2 Low signal E1 Stem cell
# Perform correlation analysis for each sequence class
cat("Performing correlation analysis for 40 sequence classes...\n")
## Performing correlation analysis for 40 sequence classes...
cor_test_estimate <- as.data.frame(sapply(DeepSEA_TST[, seq_cols, with = FALSE], function(x) {
cor.test(x, y = DeepSEA_TST$beta, method = "spearman")$estimate
}))
cor_test_p <- as.data.frame(sapply(DeepSEA_TST[, seq_cols, with = FALSE], function(x) {
cor.test(x, y = DeepSEA_TST$beta, method = "spearman")$p.value
}))
cor_results <- cbind(cor_test_estimate, cor_test_p)
colnames(cor_results) <- c("spearman.r", "p.value")
cor_results$class <- gsub(".rho", "", rownames(cor_results))
cor_results$fdr <- p.adjust(cor_results$p.value, method = "fdr", n = length(cor_results$p.value))
# Add significance annotation
cor_results$sig <- ifelse(cor_results$fdr < 0.05, "sig", "no")
cat("Significant correlations (FDR < 0.05):", sum(cor_results$fdr < 0.05), "\n")
## Significant correlations (FDR < 0.05): 17
# Plot correlation results
cat("Plotting correlation results...\n")
## Plotting correlation results...
ggplot(cor_results, aes(x = spearman.r, y = -log10(fdr), fill = sig)) +
scale_fill_manual(values = c("grey", "deepskyblue3")) +
geom_point(size = 3, shape = 21, colour = "grey4") +
theme_classic() +
ylab("-Log10 Adjusted P-value") +
xlab("Spearman Correlation Coefficient") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
ggrepel::geom_text_repel(
data = cor_results[cor_results$fdr < 0.05, ],
aes(label = class),
max.overlaps = 100,
point.padding = unit(0.2, "lines"),
min.segment.length = 0,
size = 3
) +
theme(legend.position = "none") +
ggtitle("DeepSEA 40 Sequence Class Correlation with eQTL Effect Sizes\n(TST Genes)")
cat("Loading DeepSEA 21,907 features data...\n")
## Loading DeepSEA 21,907 features data...
if (file.exists(params$deepsea_features_path)) {
DeepSEA_features <- fread(params$deepsea_features_path)
cat("DeepSEA features data dimensions:", dim(DeepSEA_features), "\n")
# Merge with non-TST eGenes
DeepSEA_TST <- merge(eGenes_TST, DeepSEA_features, by.x = "SNP", by.y = "id")
cat("Merged non-TST eGenes with DeepSEA features:", nrow(DeepSEA_TST), "\n")
# Assuming features start at column 19 onwards
feature_cols <- colnames(DeepSEA_TST)[19:ncol(DeepSEA_TST)]
cat("Number of features:", length(feature_cols), "\n")
} else {
stop("DeepSEA features file not found: ", params$deepsea_features_path)
}
## DeepSEA features data dimensions: 7189 21915
## Merged non-TST eGenes with DeepSEA features: 1719
## Number of features: 21907
cat("Performing correlation analysis for features (this may take a while)...\n")
## Performing correlation analysis for features (this may take a while)...
# For large feature sets, we might want to sample or process in chunks
# Here we'll process all features but be aware it might be computationally intensive
cor_test_estimate_features <- as.data.frame(sapply(DeepSEA_TST[, feature_cols, with = FALSE], function(x) {
cor.test(x, y = DeepSEA_TST$beta, method = "spearman")$estimate
}))
cor_test_p_features <- as.data.frame(sapply(DeepSEA_TST[, feature_cols, with = FALSE], function(x) {
cor.test(x, y = DeepSEA_TST$beta, method = "spearman")$p.value
}))
cor_features <- cbind(cor_test_estimate_features, cor_test_p_features)
colnames(cor_features) <- c("spearman.r", "p.value")
cor_features$class <- gsub(".rho", "", rownames(cor_features))
# Parse class information
cor_features <- cor_features %>%
tidyr::separate(class, c("Cell.type", "chromatin.type", "dataset.id"), sep = "[|]", remove = FALSE)
cor_features$fdr <- p.adjust(cor_features$p.value, method = "fdr", n = length(cor_features$p.value))
cat("Significant feature correlations (FDR < 0.05):", sum(cor_features$fdr < 0.05), "\n")
## Significant feature correlations (FDR < 0.05): 8018
# Select top feature per chromatin type and cell type
cor_features$id <- paste0(cor_features$chromatin.type, "-", cor_features$Cell.type)
cor_features_top <- setDT(cor_features)[, .SD[which.min(p.value)], by = id]
# Add significance annotation
cor_features_top$sig <- ifelse(cor_features_top$fdr < 0.05, "yes", "no")
cat("Top features after grouping:", nrow(cor_features_top), "\n")
## Top features after grouping: 8618
cat("Significant top features:", sum(cor_features_top$fdr < 0.05), "\n")
## Significant top features: 3917
# Filter top 10 positive and negative by spearman.r
top_pos <- cor_features_top %>%
filter(fdr < 0.05) %>%
arrange(desc(spearman.r)) %>%
slice_head(n = 10)
top_neg <- cor_features_top %>%
filter(fdr < 0.05) %>%
arrange(spearman.r) %>%
slice_head(n = 10)
# Plot volcano plot
cat("Plotting volcano plot for top features...\n")
## Plotting volcano plot for top features...
ggplot(cor_features_top, aes(x = spearman.r, y = -log10(fdr), fill = sig)) +
scale_fill_manual(values = c("grey", "deepskyblue3")) +
geom_point(size = 2, shape = 21, colour = "grey4") +
theme_classic() +
ylab("-Log10 Adjusted P-value") +
xlab("Spearman Correlation Coefficient") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
coord_cartesian(xlim = c(-0.2, 0.2)) +
# Label top 10 positive (right)
geom_text_repel(
data = top_pos,
aes(label = id),
hjust = 1,
direction = "y",
nudge_x = 0.2,
point.padding = 0,
size = 2
) +
# Label top 10 negative (left)
geom_text_repel(
data = top_neg,
aes(label = id),
hjust = 0,
direction = "y",
nudge_x = -0.2,
point.padding = 0,
size = 2
) +
theme(legend.position = "none") +
ggtitle("DeepSEA Feature Correlation with eQTL Effect Sizes\n(TST Genes, Top per Cell Type + Chromatin Type)")
cat("Comparing average predicted effects between TST and non-TST eQTLs...\n")
## Comparing average predicted effects between TST and non-TST eQTLs...
# Add TST annotation to DeepSEA data
DeepSEA_40$TSTs <- ifelse(DeepSEA_40$id %in% eGenes_TST$SNP, "TST", "non-TST")
cat("TST SNPs in DeepSEA data:", sum(DeepSEA_40$TSTs == "TST"), "\n")
## TST SNPs in DeepSEA data: 1703
cat("Non-TST SNPs in DeepSEA data:", sum(DeepSEA_40$TSTs == "non-TST"), "\n")
## Non-TST SNPs in DeepSEA data: 5486
# Calculate average scores per group
plot_data <- DeepSEA_40
chromatin_cols <- colnames(plot_data)[which(colnames(plot_data) == "PC1 Polycomb / Heterochromatin"):which(colnames(plot_data) == "HET6 Centromere")]
avg_df <- plot_data %>%
group_by(TSTs) %>%
summarise(across(all_of(chromatin_cols), mean, na.rm = TRUE)) %>%
pivot_longer(-TSTs, names_to = "Chromatin_State", values_to = "Average_Score")
# Reorder factor levels
avg_df$TSTs <- factor(avg_df$TSTs, levels = c("TST", "non-TST"))
cat("Average scores calculated for", length(unique(avg_df$Chromatin_State)), "chromatin states\n")
## Average scores calculated for 40 chromatin states
# Plot average scores comparison
ggplot(avg_df, aes(x = Chromatin_State, y = Average_Score, colour = TSTs)) +
geom_point(size = 3) +
scale_color_manual(values = c("TST" = "darkorange", "non-TST" = "cyan4")) +
facet_grid(TSTs ~ .) +
labs(
x = "Sequence Class",
y = "Average Predicted Effect"
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
legend.position = "none",
strip.background = element_blank(),
strip.text = element_text(size = 10, face = "bold")
) +
ggtitle("Average DeepSEA Predicted Effects by Sequence Class\nTST vs Non-TST eQTLs")
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/London
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggrepel_0.9.6 ggplot2_4.0.0 tidyr_1.3.1 dplyr_1.1.4
## [5] data.table_1.16.2
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_1.8.7 highr_0.10 compiler_4.3.2
## [5] Rcpp_1.0.13-1 tidyselect_1.2.1 jquerylib_0.1.4 scales_1.4.0
## [9] yaml_2.3.7 fastmap_1.1.1 R6_2.5.1 labeling_0.4.3
## [13] generics_0.1.3 knitr_1.45 tibble_3.2.1 bslib_0.5.1
## [17] pillar_1.9.0 RColorBrewer_1.1-3 R.utils_2.12.3 rlang_1.1.4
## [21] utf8_1.2.4 cachem_1.0.8 xfun_0.54 sass_0.4.7
## [25] S7_0.2.0 cli_3.6.3 withr_3.0.2 magrittr_2.0.3
## [29] digest_0.6.33 grid_4.3.2 lifecycle_1.0.4 R.oo_1.27.0
## [33] R.methodsS3_1.8.2 vctrs_0.6.5 evaluate_0.23 glue_1.8.0
## [37] farver_2.1.2 fansi_1.0.6 rmarkdown_2.25 purrr_1.0.2
## [41] tools_4.3.2 pkgconfig_2.0.3 htmltools_0.5.7